import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
Perform a descriptive analysis. Understand the variables and their corresponding values. In the columns below, a value of zero does not make sense and therefore indicates a missing value:
• Glucose
• BloodPressure
• SkinThickness
• Insulin
• BMI
Visually explore these variables using histograms, and treat the missing values accordingly.
There are integer and float data-type variables in this dataset. Create a count (frequency) plot describing the data types and the count of variables of each type.
# Load the diabetes dataset plus a separate training split from CSV.
health_care_df = pd.read_csv("health care diabetes.csv")
health_care_train = pd.read_csv("health_care_train.csv")

# Peek at the first rows and confirm the loaded object is a DataFrame.
health_care_df.head()
type(health_care_df)
# pandas-profiling builds a full EDA report with a couple of lines of code.
# The report's overview section summarises:
#   - number and types of variables (categorical, numeric, int, float, object, ...)
#   - number of observations
#   - missing values (count and percentage)
#   - duplicate rows (count and percentage)
#   - dataset size in memory
# Its warnings tab flags high/low cardinality and strongly correlated
# variables, grouped by variable type.
# NOTE(review): upstream has renamed this package to ydata-profiling --
# consider migrating the import when dependencies are next updated.
from pandas_profiling import ProfileReport

report = ProfileReport(health_care_df)
report
# From the pandas profiling report, we can draw the following information:
# Pregnancies has 111 (14.5%) zero values.
# BloodPressure has 35 (4.6%) zero values.
# SkinThickness has 227 (29.6%) zero values.
# Insulin has 374 (48.7%) zero values.
# BMI has 11 (1.4%) zero values.
• Check the balance of the data by plotting the count of outcomes by their value. Describe your findings and plan the future course of action.
• Create scatter charts between pairs of variables to understand their relationships. Describe your findings.
• Perform a correlation analysis. Visually explore it using a heat map.
# Subset of patients diagnosed with diabetes (Outcome == 1); reused below
# for the per-class scatter plots.
positive = health_care_df[health_care_df['Outcome'] == 1]
positive.head()

# Compare the mean of each feature across the two outcome classes.
# seaborn >= 0.12 deprecates (and 0.13 removes) positional x/y arguments,
# so pass them as keywords; one figure per feature mirrors the original
# one-plot-per-cell behaviour.
for feature in ['Pregnancies', 'Glucose', 'BMI', 'BloodPressure',
                'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction', 'Age']:
    plt.figure()
    sns.barplot(x='Outcome', y=feature, data=health_care_df)
plt.show()
# Correlation among the predictor variables (target column excluded).
corr_feature = health_care_df.drop('Outcome', axis=1)
sns.heatmap(corr_feature.corr())
# Observation: Glucose and Insulin are among the most strongly
# correlated predictor pairs.
# Pairwise scatter plots among diabetic patients only (Outcome == 1).
BloodPressure = positive['BloodPressure']
Glucose = positive['Glucose']
SkinThickness = positive['SkinThickness']
Insulin = positive['Insulin']
Age = positive['Age']

# plt.scatter expects a single color string here, not a one-element list.
plt.scatter(BloodPressure, Glucose, color='c')
plt.xlabel('BloodPressure')
plt.ylabel('Glucose')
plt.title('BloodPressure & Glucose')
plt.show()

plt.scatter(SkinThickness, Age, color='blue')
plt.xlabel('SkinThickness')
plt.ylabel('Age')
plt.title('SkinThickness & Age')  # title was missing on this figure
plt.show()

plt.scatter(Age, Insulin, color='red')
plt.xlabel('Age')
plt.ylabel('Insulin')
plt.title('Age & Insulin')  # title was missing on this figure
plt.show()

plt.scatter(Glucose, Insulin, color='blue')
plt.xlabel('Glucose')
plt.ylabel('Insulin')
plt.title('Glucose VS Insulin')
plt.show()
# Same relationships on the full dataset, colour-coded by Outcome.
ax_glucose_bp = sns.scatterplot(data=health_care_df, x='Glucose',
                                y='BloodPressure', hue='Outcome');
ax_age_insulin = sns.scatterplot(data=health_care_df, x="Age",
                                 y="Insulin", hue="Outcome");
ax_glucose_insulin = sns.scatterplot(data=health_care_df, x="Glucose",
                                     y="Insulin", hue="Outcome");
# Numeric correlation matrix, then the predictors-only heat map.
health_care_df.corr()
corr_feature = health_care_df.drop(columns='Outcome')
sns.heatmap(corr_feature.corr())
Create a heat map of the correlation matrix, annotated with the correlation values.
# Larger, annotated heat map of the predictor correlations.
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(corr_feature.corr(), annot=True, cmap='Accent', ax=ax)
health_care_df.head()
# Split the frame into feature matrix and target vector.
features = health_care_df.iloc[:, 0:8].values  # all eight predictor columns
label = health_care_df.iloc[:, 8].values       # Outcome column (target)

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

# The outcome classes are imbalanced, so duplicate minority-class rows
# until both classes have equal counts.
oversample = RandomOverSampler(sampling_strategy='minority')
features, label = oversample.fit_resample(features, label)

# Class distribution after oversampling (should now be 50/50).
# seaborn >= 0.12 requires the data to be passed via keyword arguments.
sns.countplot(x=label)
# Hold out 20% of the resampled data for testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=10)

# Standardise features: fit the scaler on the training split only, then
# apply that same transform to the test split (no test-set leakage).
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)
# Baseline model: logistic regression on the standardised features.
from sklearn.linear_model import LogisticRegression
model_lr = LogisticRegression()
model_lr.fit(X_train_ss, y_train)
y_model_lr_train_pred = model_lr.predict(X_train_ss)
y_model_lr_test_pred = model_lr.predict(X_test_ss)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# sklearn metrics take (y_true, y_pred) in that order; swapping them
# transposes the confusion matrix and exchanges precision with recall
# in the classification report.
print('The accuracy score of train dataset is:', accuracy_score(y_train, y_model_lr_train_pred))
print('The accuracy score of test dataset is:', accuracy_score(y_test, y_model_lr_test_pred))
print(classification_report(y_test, y_model_lr_test_pred))
confusion_matrix(y_test, y_model_lr_test_pred)
# K-fold cross validation: checks whether the single-split accuracy
# generalises (over/underfitting) and supports model selection.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
k = 5
kf = KFold(n_splits=k, random_state=None)
# Pass the fold object explicitly: previously `kf` was created but never
# used, so cross_val_score silently fell back to its default CV scheme.
result = cross_val_score(model_lr, features, label, cv=kf)
print('Accuracy of each fold are:', result)
print('Average accuracy is:', np.round(result.mean(), decimals=2))
from sklearn.neighbors import KNeighborsClassifier

# Test-set misclassification rate for each odd k from 1 to 29.
error_rates = []
for k_neighbors in range(1, 30, 2):
    model_knn = KNeighborsClassifier(n_neighbors=k_neighbors)
    model_knn.fit(X_train_ss, y_train)
    y_model_knn_test_pred = model_knn.predict(X_test_ss)
    error_rates.append(np.mean(y_model_knn_test_pred != y_test))

# Elbow plot: dots for each k plus a dashed connecting line.
plt.plot(range(1, 30, 2), error_rates, '.')
plt.plot(range(1, 30, 2), error_rates, '--')
# Fit KNN with k=1 (the lowest test error in the elbow plot).
# NOTE(review): k=1 memorises the training set (train accuracy will be
# 1.0) -- confirm with cross validation before trusting this choice.
model_knn_1 = KNeighborsClassifier(n_neighbors=1)
model_knn_1.fit(X_train_ss, y_train)
y_model_knn_train_pred_1 = model_knn_1.predict(X_train_ss)
y_model_knn_test_pred_1 = model_knn_1.predict(X_test_ss)
print('The accuracy score of train dataset is:', accuracy_score(y_train, y_model_knn_train_pred_1))
print('The accuracy score of test dataset is:', accuracy_score(y_test, y_model_knn_test_pred_1))
# (y_true, y_pred) order -- swapped arguments transpose the matrix.
confusion_matrix(y_test, y_model_knn_test_pred_1)
# Cross-validate the k=1 KNN model on the full resampled data.
from sklearn.model_selection import cross_val_score, KFold
k = 5
kf = KFold(n_splits=k, random_state=None)
# Pass the fold object explicitly; it was previously created but unused.
result_knn = cross_val_score(model_knn_1, features, label, cv=kf)
print('Accuracy of each fold are:', result_knn)
print('Average accuracy is:', np.round(result_knn.mean(), decimals=2))
# Gradient-boosted trees via XGBoost, same train/test protocol as above.
from xgboost import XGBClassifier
model_xgb = XGBClassifier()
model_xgb.fit(X_train_ss, y_train)
y_model_xgb_train_pred_1 = model_xgb.predict(X_train_ss)
y_model_xgb_test_pred_1 = model_xgb.predict(X_test_ss)
train_acc_xgb = accuracy_score(y_model_xgb_train_pred_1, y_train)
test_acc_xgb = accuracy_score(y_model_xgb_test_pred_1, y_test)
print('The accuracy score of train dataset is:', train_acc_xgb)
print('The accuracy score of the test dataset is:', test_acc_xgb)
# Single decision tree classifier on the same standardised splits.
from sklearn.tree import DecisionTreeClassifier
model_dtc = DecisionTreeClassifier()
model_dtc.fit(X_train_ss, y_train)
y_model_dtc_train_pred_1 = model_dtc.predict(X_train_ss)
y_model_dtc_test_pred_1 = model_dtc.predict(X_test_ss)
# Fixed the 'accuarcy' typo in the printed messages and put the metric
# arguments in the conventional (y_true, y_pred) order.
print('The accuracy score of train dataset is:', accuracy_score(y_train, y_model_dtc_train_pred_1))
print('The accuracy score of test dataset is: ', accuracy_score(y_test, y_model_dtc_test_pred_1))
confusion_matrix(y_test, y_model_dtc_test_pred_1)
# Random forest classifier plus cross validation and a full report.
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier()
model_rf.fit(X_train_ss, y_train)
y_model_rf_train_pred_1 = model_rf.predict(X_train_ss)
y_model_rf_test_pred_1 = model_rf.predict(X_test_ss)
print('The accuracy score of train dataset is:', accuracy_score(y_train, y_model_rf_train_pred_1))
print('The accuracy score of test dataset is:', accuracy_score(y_test, y_model_rf_test_pred_1))

k = 5
kf_rf = KFold(n_splits=k)
# Pass the fold object to cross_val_score; previously `kf_rf` was
# constructed but never used.
result_rf = cross_val_score(model_rf, features, label, cv=kf_rf)
print('Accuracy of each fold are:', result_rf)
print('Average accuracy is:', np.round(result_rf.mean(), decimals=2))
# (y_true, y_pred) order so precision/recall are labelled correctly.
print(classification_report(y_test, y_model_rf_test_pred_1))
# ROC / AUC: compare models by true-positive rate vs false-positive rate
# across every classification threshold. The area under the curve (AUC)
# summarises overall ranking quality; a good model hugs the top-left
# corner. Given the clinical context, remember the TPR/FPR trade-off:
# false positives still occur even for a high-AUC model.
from sklearn.metrics import roc_auc_score, roc_curve

# Predicted probability of the positive class (column 1 of predict_proba).
rf_probs = model_rf.predict_proba(X_test_ss)
knn_probs = model_knn_1.predict_proba(X_test_ss)

# Keep each model's thresholds in its own variable rather than
# overwriting a shared `thresholds` name.
rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, rf_probs[:, 1])
knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test, knn_probs[:, 1])
rf_roc_auc = roc_auc_score(y_test, rf_probs[:, 1])
knn_roc_auc = roc_auc_score(y_test, knn_probs[:, 1])

plt.title('Receiver Operating Characteristic (ROC)')
plt.plot(rf_fpr, rf_tpr, 'b',
         label='rf_AUC= %0.2f' % rf_roc_auc)
plt.plot(knn_fpr, knn_tpr, 'r',
         label='knn_AUC= %0.2f' % knn_roc_auc)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc='lower right')
plt.show()
# The ROC curve displayed that the random forest model has the highest AUC, which is 0.92. This is a very good model, as
# the score is above 0.9, but we should remain careful about false positives, as they still occur with this model.
# Of the models above, the random forest performs best, so I am going to deploy this model.